In [ ]:
import pandas as pd
import numpy as np
import os
from os.path import join
from joblib import Parallel, delayed
import sys
# Locate the data folder relative to where the notebook is run from:
# 'Data storage' sits one level above the current working directory.
cwd = os.getcwd()
data_path = os.path.join(cwd, '..', 'Data storage')
In [ ]:
# Extension of the yearly input files; it is substituted into the
# 'EPA emissions <year>.<file_format>' file names built further down.
# Swap the two lines below to read the alternative format instead.
file_format = 'csv'
# file_format = 'feather'
In [ ]:
# Record the environment this notebook was run in. Per the watermark
# extension's documentation, -v reports the Python/IPython versions and
# -iv the versions of the packages imported so far.
%load_ext watermark
%watermark -iv -v
In [ ]:
# Load the "autoreload" extension so edited project modules are picked up
# without restarting the kernel
%load_ext autoreload
# Mode 1: always reload modules marked with "%aimport" before running code
%autoreload 1
# Add the 'src' directory (one level above the working directory) as one
# where we can import modules
src_dir = join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
In [ ]:
# Mark each project module for autoreload, then import the helpers from it.
# Both packages are resolved through the 'src' directory appended to
# sys.path earlier in the notebook.
%aimport Data.data_extraction
from Data.data_extraction import import_group_epa, unit_conversion
%aimport Analysis.index
from Analysis.index import add_datetime, add_quarter
# NOTE(review): only import_group_epa is used in the cells visible here;
# unit_conversion, add_datetime and add_quarter may be leftovers - confirm.
In [ ]:
# Inclusive range of years whose emissions files are loaded.
start_year = 2001
end_year = 2017

# NOTE(review): the __main__ guard presumably protects the parallel workers
# on platforms that re-import the main module - confirm before removing.
if __name__ == '__main__':
    # One input file per year, named 'EPA emissions <year>.<file_format>',
    # inside the 'EPA emissions' folder of the data directory.
    base_path = join(data_path, 'EPA emissions')
    years = range(start_year, end_year + 1)
    paths = [
        join(base_path, 'EPA emissions {}.{}'.format(year, file_format))
        for year in years
    ]

    # Import every yearly file through joblib; n_jobs=-1 asks for all
    # available cores. The result is one entry per path.
    load_one = delayed(import_group_epa)
    df_list = Parallel(n_jobs=-1)(load_one(path) for path in paths)
In [ ]:
# Stack the per-year results into a single table. No ignore_index is passed,
# so each piece keeps whatever index import_group_epa gave it.
df = pd.concat(df_list)
In [ ]:
# Write the combined table into the 'Derived data' folder of the data
# directory, leaving the row index out of the CSV.
# NOTE: the date in the file name is hard-coded, not taken from the run date.
out_dir = os.path.join(data_path, 'Derived data')
path = os.path.join(out_dir, 'Monthly EPA emissions 2018-03-06.csv')
df.to_csv(path, index=False)
In [ ]: